Spaceship Titanic with 0.81295 Accuracy¶
1. Import Packages and Data¶
In [434]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")
In [435]:
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')
wdata = pd.concat([train_dataset.drop('Transported', axis = 1), test_dataset], axis=0, sort=False)
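Concatenating the train and test sets lets the imputation and feature code below run once over both. A quick shape check (a sketch, not part of the original run):

assert len(wdata) == len(train_dataset) + len(test_dataset)  # no rows lost in the concat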
2. Data Preprocessing¶
In [436]:
train_dataset.head(5)
Out[436]:
|   | PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name | Transported |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001_01 | Europa | False | B/0/P | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Maham Ofracculy | False |
| 1 | 0002_01 | Earth | False | F/0/S | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | Juanna Vines | True |
| 2 | 0003_01 | Europa | False | A/0/S | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | Altark Susent | False |
| 3 | 0003_02 | Europa | False | A/0/S | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | Solam Susent | False |
| 4 | 0004_01 | Earth | False | F/1/S | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | Willy Santantines | True |
In [438]:
def fill_nan(df):
    # Passengers under 13 or in cryosleep cannot spend money or hold VIP status,
    # so set their spending columns and 'VIP' to 0
    zero_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'VIP']
    for col in zero_columns:
        df[col] = np.where((df['Age'] < 13) | (df['CryoSleep'] == True), 0, df[col])
    # fill categorical columns with the mode
    categorical_columns = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP']
    for col in categorical_columns:
        df[col] = df[col].fillna(df[col].mode()[0])
    # backfill 'Name' and 'Cabin'
    bfill_columns = ['Name', 'Cabin']
    for col in bfill_columns:
        df[col] = df[col].bfill()
    # fill the remaining spending NaNs with 0
    numerical_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in numerical_columns:
        df[col] = df[col].fillna(0)
    # fill 'Age' with the mean age of passengers under 61
    df['Age'] = df['Age'].fillna(df[df['Age'] < 61]['Age'].mean())
    return df
# train_dataset = fill_nan(train_dataset)
# test_dataset = fill_nan(test_dataset)
wdata = fill_nan(wdata)
print(wdata.isnull().sum())
# print(train_dataset.isnull().sum())
PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64
In [439]:
def data_preprocessing(df):
    # boolean columns to int
    df['CryoSleep'] = df['CryoSleep'].astype(int)
    df['VIP'] = df['VIP'].astype(int)
    # 'Cabin' encodes deck/number/side; split it into three columns
    df[['CabinDeck', 'CabinNumber', 'CabinSide']] = df['Cabin'].str.split('/', expand=True)
    # one-hot encode the deck (column order kept as in the original run)
    for deck in ['A', 'G', 'T', 'B', 'C', 'D', 'E', 'F']:
        df['CabinDeck' + deck] = (df['CabinDeck'] == deck).astype(int)
    # # if the cabin deck is B or C, then CabinDeckBC = 1
    # df['CabinDeckBC'] = ((df['CabinDeck'] == 'B') | (df['CabinDeck'] == 'C')).astype(int)
    # # if the cabin deck is D, E, or F, then CabinDeckDEF = 1
    # df['CabinDeckDEF'] = ((df['CabinDeck'] == 'D') | (df['CabinDeck'] == 'E') | (df['CabinDeck'] == 'F')).astype(int)
    # df['CabinSide'] = df['CabinSide'].replace({'P': 1, 'S': 2})
    df['CabinSideP'] = (df['CabinSide'] == 'P').astype(int)
    df['CabinSideS'] = (df['CabinSide'] == 'S').astype(int)
    # 'HomePlanet' has 3 values; map them to 1, 2, 3
    df['HomePlanet'] = df['HomePlanet'].replace({'Earth': 1, 'Mars': 2, 'Europa': 3})
    # df['HomePlanetE'] = (df['HomePlanet'] == 'Earth').astype(int)
    # df['HomePlanetM'] = (df['HomePlanet'] == 'Mars').astype(int)
    # df['HomePlanetEu'] = (df['HomePlanet'] == 'Europa').astype(int)
    # 'Destination' has 3 values; map them to 1, 2, 3
    df['Destination'] = df['Destination'].replace({'TRAPPIST-1e': 1, '55 Cancri e': 2, 'PSO J318.5-22': 3})
    # total money spent across RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
    df['TotalSpent'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    # df['TotalSpent'] = np.log(df['TotalSpent'] + 1)
    df['Luxury'] = df['RoomService'] + df['Spa'] + df['VRDeck']
    df['Normal'] = df['FoodCourt'] + df['ShoppingMall']
    # df['Spent'] = (df['TotalSpent'] > 0).astype(int)
    # features for whether the passenger travels alone
    # note: str[0:3] keeps only the first 3 of the 4 group-id digits, so nearby groups are pooled
    df['PassengerId_group'] = df['PassengerId'].str[0:3]
    df['Group_size'] = df['PassengerId_group'].map(df['PassengerId_group'].value_counts())
    df['IsAlone'] = (df['Group_size'] == 1).astype(int)
    df['FamilyName'] = df['Name'].str.split(' ', expand=True)[1]
    df['Family_size'] = df['FamilyName'].map(df['FamilyName'].value_counts())
    df['NoFamily'] = (df['Family_size'] == 1).astype(int)
    # df['Age_below_4'] = (df['Age'] < 4).astype(int)
    # df['Age_below_19'] = (df['Age'] < 19).astype(int)
    # drop identifier and intermediate columns
    df = df.drop(['PassengerId', 'PassengerId_group', 'FamilyName', 'Name', 'Group_size', 'Family_size',
                  'Cabin', 'Destination', 'HomePlanet', 'CabinDeck', 'CabinNumber', 'CabinSide'], axis=1)
    # df = df.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis=1)
    return df
# train_dataset = data_preprocessing(train_dataset)
# test_dataset_features = data_preprocessing(test_dataset)
# train_dataset_features = train_dataset.drop('Transported', axis=1)
wdata = data_preprocessing(wdata)
wdata.head(5)
Out[439]:
|   | CryoSleep | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | CabinDeckA | CabinDeckG | ... | CabinDeckD | CabinDeckE | CabinDeckF | CabinSideP | CabinSideS | TotalSpent | Luxury | Normal | IsAlone | NoFamily |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 39.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0.0 | 0.0 | 0.0 | 0 | 0 |
| 1 | 0 | 24.0 | 0 | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 736.0 | 702.0 | 34.0 | 0 | 0 |
| 2 | 0 | 58.0 | 1 | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 10383.0 | 6807.0 | 3576.0 | 0 | 0 |
| 3 | 0 | 33.0 | 0 | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 5176.0 | 3522.0 | 1654.0 | 0 | 0 |
| 4 | 0 | 16.0 | 0 | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 1091.0 | 870.0 | 221.0 | 0 | 0 |
5 rows × 23 columns
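As an aside, the per-deck and per-side indicator columns built above could be produced in one call with pandas' get_dummies. A minimal sketch of the equivalent encoding (assuming a frame df that still carries the split CabinDeck/CabinSide columns; the resulting column names use an underscore prefix, e.g. CabinDeck_A):

# sketch: one-hot encode both cabin columns in a single call
dummies = pd.get_dummies(df[['CabinDeck', 'CabinSide']]).astype(int)
df = pd.concat([df.drop(['CabinDeck', 'CabinSide'], axis=1), dummies], axis=1)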
In [440]:
rs = RobustScaler()
ss = StandardScaler()
# wdata = rs.fit_transform(wdata)
wdata = ss.fit_transform(wdata)
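Note that fitting the scaler on the combined train+test frame lets test-set statistics influence the transform. A leakage-free alternative to the fit_transform call above (a sketch, assuming the train rows come first in wdata, as they do here):

n_train = len(train_dataset)
ss = StandardScaler()
train_part = ss.fit_transform(wdata[:n_train])  # fit on training rows only
test_part = ss.transform(wdata[n_train:])       # reuse the training statistics on test rows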
In [441]:
train_dataset_features = wdata[:len(train_dataset)]
test_dataset_features = wdata[len(train_dataset):]
train_dataset_labels = train_dataset['Transported'].astype(int)
# train_dataset_features, val_dataset_features = train_test_split(train_dataset_features, test_size=0.2, random_state=42)
# train_dataset_labels, val_dataset_labels = train_test_split(train_dataset_labels, test_size=0.2, random_state=42)
print(train_dataset_features.shape)
# train_dataset_features.head(5)
(8693, 23)
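If a local hold-out set is wanted, the two commented train_test_split calls above only stay aligned because they share a random_state; a single call keeps features and labels paired explicitly. A minimal sketch (the X_/y_ names are illustrative and not used elsewhere):

X_tr, X_val, y_tr, y_val = train_test_split(
    train_dataset_features, train_dataset_labels,
    test_size=0.2, random_state=42, stratify=train_dataset_labels)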
3. Classification Training¶
Decision Tree Classifier¶
In [445]:
# from sklearn.tree import DecisionTreeClassifier
# dt = DecisionTreeClassifier()
# dt.fit(train_dataset_features_pca, train_dataset_labels)
# dt_prediction = dt.predict(test_dataset_features_pca).astype(bool)
# output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': dt_prediction})
# output.to_csv('submission.csv', index=False)
Logistic Regression¶
In [446]:
# Logistic Regression
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression()
# lr.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: ", lr.score(train_dataset_features_pca, train_dataset_labels))
# print("This is the score of the validation set: ", lr.score(val_dataset_features_pca, val_dataset_labels))
# lr_prediction = lr.predict(test_dataset_features_pca).astype(bool)
# lr_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': lr_prediction})
# lr_output.to_csv('lr_submission.csv', index=False)
SVM Classifier¶
In [447]:
# from sklearn.svm import SVC
# svc = SVC(kernel='rbf')
# svc.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(svc.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(svc.score(val_dataset_features_pca, val_dataset_labels)))
# svc_prediction = svc.predict(test_dataset_features_pca).astype(bool)
# svc_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': svc_prediction})
# svc_output.to_csv('svc_submission.csv', index=False)
Multilayer Perceptron Classifier¶
In [448]:
# # mlp
# from sklearn.neural_network import MLPClassifier
# mlp = MLPClassifier(hidden_layer_sizes=(100, 70), activation='relu', solver='adam', learning_rate='adaptive', batch_size=16, alpha=1)
# mlp.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(mlp.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(mlp.score(val_dataset_features_pca, val_dataset_labels)))
# mlp_prediction = mlp.predict(test_dataset_features_pca).astype(bool)
# mlp_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': mlp_prediction})
# mlp_output.to_csv('mlp_submission.csv', index=False)
Ensemble Learning¶
In [449]:
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=10, max_depth=20, min_samples_leaf=20)
# rf.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(rf.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(rf.score(val_dataset_features_pca, val_dataset_labels)))
# rf_prediction = rf.predict(test_dataset_features_pca).astype(bool)
# rf_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': rf_prediction})
# rf_output.to_csv('rf_submission.csv', index=False)
In [450]:
# rf = RandomForestClassifier(n_estimators=60, max_depth=20, min_samples_leaf=20, criterion='entropy')
# rf.fit(train_dataset_features, train_dataset_labels)
# print("This is the score of the training set: " + str(rf.score(train_dataset_features, train_dataset_labels)))
# # print("This is the score of the validation set: " + str(rf.score(val_dataset_features, val_dataset_labels)))
# rf_prediction = rf.predict(test_dataset_features).astype(bool)
# rf_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': rf_prediction})
# rf_output.to_csv('rf_submission.csv', index=False)
CatBoost¶
In [451]:
# import optuna
# from sklearn.model_selection import KFold, cross_val_score
# kf = KFold(n_splits = 10)
# def objective(trial):
# params = {
# 'iterations': trial.suggest_int("iterations", 50, 1000),
# 'learning_rate': trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
# 'depth': trial.suggest_int("depth", 4, 10),
# 'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
# 'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
# 'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
# 'bagging_temperature':trial.suggest_float("bagging_temperature", 0.0, 10.0),
# 'od_type': trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
# 'od_wait':trial.suggest_int("od_wait", 10, 50),
# }
# clf = CatBoostClassifier(**params, verbose = 0, random_seed = 0)
# clf.fit(train_dataset_features_pca, train_dataset_labels)
# scores = cross_val_score(clf, train_dataset_features_pca, train_dataset_labels,
# cv = kf, scoring = 'accuracy', n_jobs=-1)
# return np.mean(scores)
# study = optuna.create_study(direction = 'maximize')
# study.optimize(objective, n_trials = 100)
# print('Best hyperparameters:', study.best_params)
# print('Best Accuracy:', study.best_value)
The code above runs the hyperparameter search; the best parameters it found are used below¶
In [452]:
param = {
    'iterations': 997,
    'learning_rate': 0.006179823417619039,
    'depth': 5,
    'l2_leaf_reg': 0.015049126492951247,
    'bootstrap_type': 'Bayesian',
    'random_strength': 0.0031066667545993952,
    'bagging_temperature': 0.45869966946262664,
    'od_type': 'IncToDec',
    'od_wait': 42,
}
In [454]:
cat = CatBoostClassifier(**param, eval_metric='Accuracy', verbose=0)
cat.fit(train_dataset_features, train_dataset_labels)
print("This is the score of the training set: " + str(cat.score(train_dataset_features, train_dataset_labels)))
# print("This is the score of the validation set: " + str(cat.score(val_dataset_features, val_dataset_labels)))
This is the score of the training set: 0.8253767399056712
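The training score above is optimistic. A 10-fold cross-validation sketch (not run here) would give an estimate closer to the 0.81295 leaderboard accuracy:

from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(CatBoostClassifier(**param, eval_metric='Accuracy', verbose=0),
                            train_dataset_features, train_dataset_labels,
                            cv=10, scoring='accuracy')
print('10-fold CV accuracy: %.5f (+/- %.5f)' % (cv_scores.mean(), cv_scores.std()))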
Linear Discriminant Analysis¶
In [457]:
# # LDA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# lda = LinearDiscriminantAnalysis()
# lda.fit(train_dataset_features_pca, train_dataset_labels)
# lda.score(val_dataset_features_pca, val_dataset_labels)
Neural Network¶
In [458]:
# from tensorflow import keras
# from keras import layers
# model = keras.Sequential([
# layers.Dense(10, activation='relu', input_shape=[3]),
# layers.Dropout(0.3),
# layers.Dense(20, activation='relu'),
# # dropout
# layers.Dropout(0.2),
# layers.Dense(1, activation='sigmoid'),
# ])
# model.compile(
# optimizer='adam',
# loss='binary_crossentropy',
# metrics=['binary_accuracy'],
# )
# history = model.fit(
# train_dataset_features_pca, train_dataset_labels,
# validation_data=(val_dataset_features_pca, val_dataset_labels),
# batch_size=256,
# epochs=1000,
# verbose=0, # hide the output because we have so many epochs
# )
# history_df = pd.DataFrame(history.history)
# # Start the plot at epoch 5
# history_df.loc[5:, ['loss', 'val_loss']].plot()
# history_df.loc[5:, ['binary_accuracy', 'val_binary_accuracy']].plot()
# print(("Best Validation Loss: {:0.4f}" +\
# "\nBest Validation Accuracy: {:0.4f}")\
# .format(history_df['val_loss'].min(),
# history_df['val_binary_accuracy'].max()))
4. Output¶
In [459]:
cat_prediction = cat.predict(test_dataset_features).astype(bool)
cat_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': cat_prediction})
cat_output.to_csv('cat_submission.csv', index=False)
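A quick sanity check on the written file before submitting (a sketch; the expected row count comes from the test set):

submission = pd.read_csv('cat_submission.csv')
assert list(submission.columns) == ['PassengerId', 'Transported']  # Kaggle's required columns
assert len(submission) == len(test_dataset)                        # one prediction per test row
submission.head()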